package com.wujunshen.core;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.wujunshen.dictionary.SynonymDictionary;
import com.wujunshen.enumation.SegmentationType;
import com.wujunshen.nature.NatureAttribute;
import java.io.BufferedReader;
import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.LinkedTransferQueue;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.SpecialPermission;

/**
 * HanLP-backed Lucene {@link org.apache.lucene.analysis.Tokenizer} that supports plain NLP
 * segmentation, index-mode segmentation (overlapping sub-words), and synonym expansion,
 * selected via {@link com.wujunshen.enumation.SegmentationType}.
 *
 * @author wujunshen
 * @version 1.0
 * @since 1.0
 */
@Slf4j
public final class MyTokenizer extends Tokenizer {
    /** Read-buffer size used when draining the input reader. */
    private static final int BUFFER_SIZE = 500;

    /** Shared NLP-mode segmenter with named-entity recognition enabled. */
    private static Segment nlpSegment;
    /** Shared index-mode segmenter that also emits overlapping sub-words. */
    private static Segment indexSegment;

    static {
        // Elasticsearch runs plugins under a SecurityManager; the privileged
        // blocks below are required so HanLP may load its dictionaries.
        SecurityManager sm = System.getSecurityManager();

        if (sm != null) {
            sm.checkPermission(new SpecialPermission());
        }

        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            Nature.create("auxiliary");

            return null;
        });
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            nlpSegment = HanLP.newSegment()
                    // part-of-speech tagging
                    .enablePartOfSpeechTagging(true)
                    // compute character offsets
                    .enableOffset(true)
                    // Chinese person-name recognition
                    .enableNameRecognize(true)
                    // Japanese person-name recognition
                    .enableJapaneseNameRecognize(true)
                    // number/quantifier recognition
                    .enableNumberQuantifierRecognize(true)
                    // organization-name recognition
                    .enableOrganizationRecognize(true)
                    // transliterated person-name recognition
                    .enableTranslatedNameRecognize(true);

            indexSegment = HanLP.newSegment()
                    .enableIndexMode(true)
                    // part-of-speech tagging
                    .enablePartOfSpeechTagging(true)
                    // compute character offsets
                    .enableOffset(true);

            // Run one segmentation eagerly so dictionary loading and caching
            // happen here, inside the privileged block, not on the first query.
            log.info(String.valueOf(nlpSegment.seg("HanLP中文分词工具包！")));
            log.info(String.valueOf(indexSegment.seg("HanLP中文分词工具包！")));

            return null;
        });
    }

    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute =
            addAttribute(PositionIncrementAttribute.class);
    private final NatureAttribute natureAttribute = addAttribute(NatureAttribute.class);
    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
    private final Queue<MyTerm> terms = new LinkedTransferQueue<>();
    private final Queue<String> tokens = new LinkedTransferQueue<>();
    private final SegmentationType segmentationType;
    private BufferedReader reader = null;
    // Term whose span/nature the currently queued tokens belong to.
    private MyTerm currentTerm = null;
    // True once the whole input has been read and segmented, so we never
    // re-read and re-segment an exhausted stream on every incrementToken call.
    private boolean inputConsumed = false;
    // True while the next emitted token is the first one for currentTerm;
    // later tokens (synonyms) share the position (increment 0).
    private boolean firstTokenOfTerm = true;
    // Character length of the consumed input, reported by end().
    private int finalOffset = 0;

    /**
     * Creates a tokenizer for the given segmentation mode.
     *
     * @param segmentationType NLP, INDEX or SYNONYM mode
     */
    public MyTokenizer(SegmentationType segmentationType) {
        this.segmentationType = segmentationType;
    }

    /**
     * Advances to the next token, populating term/offset/position/nature/type
     * attributes. Calls {@code clearAttributes()} first, as the
     * {@code TokenStream} contract requires.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if reading the input fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        String token = getToken();

        if (token == null) {
            return false;
        }

        charTermAttribute.setEmpty().append(token);
        // All tokens expanded from one term (e.g. its synonyms) keep the
        // original term's character span.
        offsetAttribute.setOffset(
                correctOffset(currentTerm.offset),
                correctOffset(currentTerm.offset + currentTerm.word.length()));
        // Synonyms after the first token stack at the same position.
        positionIncrementAttribute.setPositionIncrement(firstTokenOfTerm ? 1 : 0);
        firstTokenOfTerm = false;
        natureAttribute.setNature(currentTerm.nature);
        typeAttribute.setType(currentTerm.nature.toString());

        return true;
    }

    /**
     * Resets per-document state so the tokenizer can be reused after
     * {@code setReader}. Required by the Lucene reuse contract.
     *
     * @throws IOException propagated from {@code super.reset()}
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        terms.clear();
        tokens.clear();
        currentTerm = null;
        inputConsumed = false;
        firstTokenOfTerm = true;
        finalOffset = 0;
        reader = new BufferedReader(input);
    }

    /**
     * Reports the final offset after the stream is exhausted, as required for
     * correct highlighting of multi-valued fields.
     *
     * @throws IOException propagated from {@code super.end()}
     */
    @Override
    public void end() throws IOException {
        super.end();
        int offset = correctOffset(finalOffset);
        offsetAttribute.setOffset(offset, offset);
    }

    /**
     * Segments {@code text} with the NLP segmenter and, in INDEX/SYNONYM mode,
     * additionally with the index-mode segmenter, merging the de-duplicated
     * results sorted by offset, then by descending word length.
     */
    private List<MyTerm> seg(String text) {
        Set<MyTerm> termSet = segAndConvert(nlpSegment, text);

        if ((segmentationType == SegmentationType.INDEX) || (segmentationType == SegmentationType.SYNONYM)) {
            Set<MyTerm> indexTerms = segAndConvert(indexSegment, text);
            termSet.addAll(indexTerms);
        }

        List<MyTerm> finalResults = new ArrayList<>(termSet);
        Comparator<MyTerm> offsetComparator = Comparator.comparing(t -> t.offset);
        Comparator<MyTerm> lengthComparator = Comparator.comparing(t -> t.word.length(), Comparator.reverseOrder());

        finalResults.sort(offsetComparator.thenComparing(lengthComparator));

        return finalResults;
    }

    /** Runs one segmenter over {@code text} and converts the result to a term set. */
    private Set<MyTerm> segAndConvert(Segment segment, String text) {
        return segment.seg(text).stream().map(MyTerm::new).collect(Collectors.toSet());
    }

    /**
     * Returns the next segmented term, lazily reading and segmenting the whole
     * input on first demand; {@code null} once the input is exhausted.
     *
     * @throws IOException if reading the input fails
     */
    private MyTerm getTerm() throws IOException {
        MyTerm term = terms.poll();

        if (term != null || inputConsumed) {
            return term;
        }

        if (reader == null) {
            reader = new BufferedReader(input);
        }

        int length;
        StringBuilder sb = new StringBuilder();
        char[] buffer = new char[BUFFER_SIZE];

        while (-1 != (length = reader.read(buffer, 0, buffer.length))) {
            sb.append(buffer, 0, length);
        }

        inputConsumed = true;
        finalOffset = sb.length();

        if (sb.length() > 0) {
            terms.addAll(seg(sb.toString()));
        }

        return terms.poll();
    }

    /**
     * Returns the next token string, expanding a term into its synonyms in
     * SYNONYM mode. Skips terms whose synonym expansion is empty instead of
     * ending the stream prematurely.
     *
     * @throws IOException if reading the input fails
     */
    private String getToken() throws IOException {
        String token = tokens.poll();

        while (token == null) {
            MyTerm term = getTerm();

            if (term == null) {
                return null;
            }

            currentTerm = term;
            firstTokenOfTerm = true;

            if (segmentationType == SegmentationType.SYNONYM) {
                SynonymDictionary.get(term.word).forEach(tokens::offer);
            } else {
                tokens.offer(term.word);
            }

            token = tokens.poll();
        }

        return token;
    }
}
